main%20Image.jpg

Importing Libraries:¶

In [1]:
import pandas as pd 
import numpy as np 
import plotly
import matplotlib.pyplot as plt 
import seaborn as sns 

import missingno as msno
import warnings
warnings.filterwarnings("ignore")

Exploratory Data Analysis¶

In [2]:
df = pd.read_csv("water_potability.csv")
df
Out[2]:
ph Hardness Solids Chloramines Sulfate Conductivity Organic_carbon Trihalomethanes Turbidity Potability
0 NaN 204.890455 20791.318981 7.300212 368.516441 564.308654 10.379783 86.990970 2.963135 0
1 3.716080 129.422921 18630.057858 6.635246 NaN 592.885359 15.180013 56.329076 4.500656 0
2 8.099124 224.236259 19909.541732 9.275884 NaN 418.606213 16.868637 66.420093 3.055934 0
3 8.316766 214.373394 22018.417441 8.059332 356.886136 363.266516 18.436524 100.341674 4.628771 0
4 9.092223 181.101509 17978.986339 6.546600 310.135738 398.410813 11.558279 31.997993 4.075075 0
... ... ... ... ... ... ... ... ... ... ...
3271 4.668102 193.681735 47580.991603 7.166639 359.948574 526.424171 13.894419 66.687695 4.435821 1
3272 7.808856 193.553212 17329.802160 8.061362 NaN 392.449580 19.903225 NaN 2.798243 1
3273 9.419510 175.762646 33155.578218 7.350233 NaN 432.044783 11.039070 69.845400 3.298875 1
3274 5.126763 230.603758 11983.869376 6.303357 NaN 402.883113 11.168946 77.488213 4.708658 1
3275 7.874671 195.102299 17404.177061 7.509306 NaN 327.459760 16.140368 78.698446 2.309149 1

3276 rows × 10 columns

In [3]:
df.describe()
Out[3]:
ph Hardness Solids Chloramines Sulfate Conductivity Organic_carbon Trihalomethanes Turbidity Potability
count 2785.000000 3276.000000 3276.000000 3276.000000 2495.000000 3276.000000 3276.000000 3114.000000 3276.000000 3276.000000
mean 7.080795 196.369496 22014.092526 7.122277 333.775777 426.205111 14.284970 66.396293 3.966786 0.390110
std 1.594320 32.879761 8768.570828 1.583085 41.416840 80.824064 3.308162 16.175008 0.780382 0.487849
min 0.000000 47.432000 320.942611 0.352000 129.000000 181.483754 2.200000 0.738000 1.450000 0.000000
25% 6.093092 176.850538 15666.690297 6.127421 307.699498 365.734414 12.065801 55.844536 3.439711 0.000000
50% 7.036752 196.967627 20927.833607 7.130299 333.073546 421.884968 14.218338 66.622485 3.955028 0.000000
75% 8.062066 216.667456 27332.762127 8.114887 359.950170 481.792304 16.557652 77.337473 4.500320 1.000000
max 14.000000 323.124000 61227.196008 13.127000 481.030642 753.342620 28.300000 124.000000 6.739000 1.000000
In [4]:
df.shape
Out[4]:
(3276, 10)
In [5]:
df.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3276 entries, 0 to 3275
Data columns (total 10 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   ph               2785 non-null   float64
 1   Hardness         3276 non-null   float64
 2   Solids           3276 non-null   float64
 3   Chloramines      3276 non-null   float64
 4   Sulfate          2495 non-null   float64
 5   Conductivity     3276 non-null   float64
 6   Organic_carbon   3276 non-null   float64
 7   Trihalomethanes  3114 non-null   float64
 8   Turbidity        3276 non-null   float64
 9   Potability       3276 non-null   int64  
dtypes: float64(9), int64(1)
memory usage: 256.1 KB
In [6]:
df.nunique()
Out[6]:
ph                 2785
Hardness           3276
Solids             3276
Chloramines        3276
Sulfate            2495
Conductivity       3276
Organic_carbon     3276
Trihalomethanes    3114
Turbidity          3276
Potability            2
dtype: int64
In [7]:
df['Potability'].value_counts()
Out[7]:
Potability
0    1998
1    1278
Name: count, dtype: int64

There is an imbalance of data for the target variable. Non poptable samples have been oversampled versus the potatble samples.

Count PLot¶

In [8]:
sns.countplot(data = df, x = 'Potability')
Out[8]:
<Axes: xlabel='Potability', ylabel='count'>

Pie Plot/chart¶

In [9]:
df.Potability.value_counts().plot(kind ='pie')
Out[9]:
<Axes: ylabel='count'>

Dist Plot¶

In [10]:
non_potable = df[df['Potability'] == 0]
potable = df[df['Potability'] == 1]

plt.figure(figsize=(13,13))

for ax,col in enumerate(df.columns[:9]):
    plt.subplot(3,3,ax+1)
    plt.title(col)
    sns.kdeplot(x=non_potable[col],label = "Non Potable")
    sns.kdeplot(x=potable[col],label = "Potable")
    plt.legend()

plt.suptitle('Water Quality Distribution',y=1.01, size = 16, color = 'black', weight='bold')   
plt.tight_layout()
In [11]:
df.drop('Potability', axis=1).skew()
Out[11]:
ph                 0.025630
Hardness          -0.039342
Solids             0.621634
Chloramines       -0.012098
Sulfate           -0.035947
Conductivity       0.264490
Organic_carbon     0.025533
Trihalomethanes   -0.083031
Turbidity         -0.007817
dtype: float64

Distirbution of the data is mostly Gaussian.

In [12]:
df.corr()
Out[12]:
ph Hardness Solids Chloramines Sulfate Conductivity Organic_carbon Trihalomethanes Turbidity Potability
ph 1.000000 0.082096 -0.089288 -0.034350 0.018203 0.018614 0.043503 0.003354 -0.039057 -0.003556
Hardness 0.082096 1.000000 -0.046899 -0.030054 -0.106923 -0.023915 0.003610 -0.013013 -0.014449 -0.013837
Solids -0.089288 -0.046899 1.000000 -0.070148 -0.171804 0.013831 0.010242 -0.009143 0.019546 0.033743
Chloramines -0.034350 -0.030054 -0.070148 1.000000 0.027244 -0.020486 -0.012653 0.017084 0.002363 0.023779
Sulfate 0.018203 -0.106923 -0.171804 0.027244 1.000000 -0.016121 0.030831 -0.030274 -0.011187 -0.023577
Conductivity 0.018614 -0.023915 0.013831 -0.020486 -0.016121 1.000000 0.020966 0.001285 0.005798 -0.008128
Organic_carbon 0.043503 0.003610 0.010242 -0.012653 0.030831 0.020966 1.000000 -0.013274 -0.027308 -0.030001
Trihalomethanes 0.003354 -0.013013 -0.009143 0.017084 -0.030274 0.001285 -0.013274 1.000000 -0.022145 0.007130
Turbidity -0.039057 -0.014449 0.019546 0.002363 -0.011187 0.005798 -0.027308 -0.022145 1.000000 0.001581
Potability -0.003556 -0.013837 0.033743 0.023779 -0.023577 -0.008128 -0.030001 0.007130 0.001581 1.000000

Heatmap¶

In [13]:
plt.figure(figsize = (12,10))
cmap = sns.diverging_palette(250, 30, l=65, center="dark", as_cmap=True)
sns.heatmap(df.corr(), annot = True, cmap=cmap)
plt.suptitle('Water Quality Heat Map', y = 0.92, size = 16, color = 'black', weight='bold')
Out[13]:
Text(0.5, 0.92, 'Water Quality Heat Map')
In [14]:
df.corr().abs()['Potability'].sort_values(ascending = False)
Out[14]:
Potability         1.000000
Solids             0.033743
Organic_carbon     0.030001
Chloramines        0.023779
Sulfate            0.023577
Hardness           0.013837
Conductivity       0.008128
Trihalomethanes    0.007130
ph                 0.003556
Turbidity          0.001581
Name: Potability, dtype: float64

Pairplot¶

In [15]:
ax = sns.pairplot(df, hue = "Potability", diag_kind = "kde", kind = "scatter")
ax.fig.set_size_inches(16,16)
ax.fig.suptitle('Water Quality Pair-Plot',y=1.01, size = 16, color = 'black', weight='bold')
Out[15]:
Text(0.5, 1.01, 'Water Quality Pair-Plot')

Both the correlation heatmap and pairplots between the variables show that there are no clear linear relationships between the features and the target variable. We can rule out using a simple linear model.

Box Plot¶

In [16]:
fig, ax = plt.subplots(ncols=5, nrows=2, figsize=(30,10))
index = 0
ax = ax.flatten()

for attributeCol, value in df.items():
    sns.boxplot(y=attributeCol, data=df, ax=ax[index], color="#cf7849")
    index += 1
plt.tight_layout(pad=1.5, w_pad=1.5, h_pad=5.0)

plt.suptitle('Water Quality Box-Plot',y=1.01, size = 16, color = 'black', weight='bold')
Out[16]:
Text(0.5, 1.01, 'Water Quality Box-Plot')

There are outliers within every feature of the dataset.

Dealing with Missing Values¶

In [17]:
df.isna().any().any()
Out[17]:
True
In [18]:
df.isnull().sum()
Out[18]:
ph                 491
Hardness             0
Solids               0
Chloramines          0
Sulfate            781
Conductivity         0
Organic_carbon       0
Trihalomethanes    162
Turbidity            0
Potability           0
dtype: int64
In [19]:
df.isnull().mean().plot.bar(figsize=(12,6), color="#cf7849") 
plt.ylabel('Percentage of missing values') 
plt.xlabel('Features') 
plt.title('Missing Data in Percentages');
In [20]:
df.isnull().mean()*100 
Out[20]:
ph                 14.987790
Hardness            0.000000
Solids              0.000000
Chloramines         0.000000
Sulfate            23.840049
Conductivity        0.000000
Organic_carbon      0.000000
Trihalomethanes     4.945055
Turbidity           0.000000
Potability          0.000000
dtype: float64
In [21]:
df[df['Potability']==0][['ph','Sulfate','Trihalomethanes']].mean()
Out[21]:
ph                   7.085378
Sulfate            334.564290
Trihalomethanes     66.303555
dtype: float64
In [22]:
df[df['Potability']==1][['ph','Sulfate','Trihalomethanes']].mean()
Out[22]:
ph                   7.073783
Sulfate            332.566990
Trihalomethanes     66.539684
dtype: float64

The three features with missing values all have Gaussian distribution therefore it is a safe assumption to use the mean of each feature to fill in for the missing values.

In [23]:
df1 = df.copy()
df1['ph'].fillna(value=df1['ph'].mean(),inplace=True)
df1['Sulfate'].fillna(value=df1['Sulfate'].mean(),inplace=True)
df1['Trihalomethanes'].fillna(value=df1['Trihalomethanes'].mean(),inplace=True)
In [24]:
df1.isnull().sum()
Out[24]:
ph                 0
Hardness           0
Solids             0
Chloramines        0
Sulfate            0
Conductivity       0
Organic_carbon     0
Trihalomethanes    0
Turbidity          0
Potability         0
dtype: int64

Dealing with Outliers¶

Since the features have Gaussian distribution we can trim out the outliers data outside of 3 standard deviations.

In [25]:
from scipy import stats

df_water = df1.copy()
df_water = df_water[(np.abs(stats.zscore(df1)) <= 3).all(axis=1)]
In [26]:
df_water.shape
Out[26]:
(3128, 10)

Box Plot ( After Dealing With Outliers )¶

In [27]:
fig, ax = plt.subplots(ncols=5, nrows=2, figsize=(30,10))
index = 0
ax = ax.flatten()

for attributeCol, value in df_water.items():
    sns.boxplot(y=attributeCol, data=df_water, ax=ax[index], color='#597eb3')
    index += 1
plt.tight_layout(pad=1.5, w_pad=1.5, h_pad=5.0)
plt.suptitle('Water Quality Box-Plot (after outlier trimming)',y=1.01, size = 16, color = 'black', weight='bold')
Out[27]:
Text(0.5, 1.01, 'Water Quality Box-Plot (after outlier trimming)')

Dealing with Class Imbalance¶

In [28]:
count_class_0, count_class_1 = df_water.Potability.value_counts()

df_class_0 = df_water[df_water['Potability'] == 0]
df_class_1 = df_water[df_water['Potability'] == 1]
In [29]:
df_class_1_over = df_class_1.sample(count_class_0, replace=True)
df_test_over = pd.concat([df_class_0, df_class_1_over], axis=0)

print('Random over-sampling:')
print(df_test_over.Potability.value_counts())

df_test_over.Potability.value_counts().plot(kind='pie', title='Count (target)');
Random over-sampling:
Potability
0    1930
1    1930
Name: count, dtype: int64

Preprocessing: Train-test split and standardized scaler¶

In [30]:
from sklearn.model_selection import train_test_split

X = df_test_over.drop('Potability',axis=1)
y = df_test_over['Potability']

X_train,X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
In [31]:
from sklearn.preprocessing import StandardScaler

scale = StandardScaler()
X_train=scale.fit_transform(X_train)
X_test=scale.transform(X_test)
In [32]:
print("X_train:", X_train.shape, "\ny_train:", y_train.shape)
print("X_test:", X_test.shape, "\ny_test:", y_test.shape)
X_train: (3088, 9) 
y_train: (3088,)
X_test: (772, 9) 
y_test: (772,)

Exploring dimensionalty reduction using PCA:¶

In [33]:
from sklearn.decomposition import PCA

pca = PCA()

X_train_pca = pca.fit_transform(X_train)
exp_var_pca = pca.explained_variance_ratio_

cum_sum_eigenvalues = np.cumsum(exp_var_pca)

plt.bar(range(0,len(exp_var_pca)), exp_var_pca, alpha=0.5, align='center', label='Individual explained variance')
plt.step(range(0,len(cum_sum_eigenvalues)), cum_sum_eigenvalues, where='mid', label='Cumulative explained variance')
plt.ylabel('Explained variance ratio')
plt.xlabel('Principal component index')
plt.title("Water Quality PCA")
plt.legend(loc='best')
plt.show()

PCA tells us that all the features are independent and confirm our previous observations from the correlation matrix and heatmap that there are not linear relationships between the features. This PCA analysis of explained variance ratio tells us that we would still need 8 components to explain 80% of the variance. Therefore, dimensionality reduction will not be beneficial with this dataset.

Model Training -- First Iteration¶

In [34]:
from sklearn.metrics import precision_score, accuracy_score, mean_absolute_error, mean_squared_error, confusion_matrix, classification_report, recall_score, f1_score

from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier

1. Logistic Regression -- First Iteration¶

In [35]:
LogReg = LogisticRegression()
LogReg.fit(X_train, y_train)
LogReg_pred = LogReg.predict(X_test)

LogReg_acc = accuracy_score(y_test, LogReg_pred)
print("The accuracy for Logistic Regression is", LogReg_acc)
print("The classification report using Logistic Regression is:")
print(classification_report(y_test, LogReg_pred))
The accuracy for Logistic Regression is 0.45466321243523317
The classification report using Logistic Regression is:
              precision    recall  f1-score   support

           0       0.47      0.40      0.43       400
           1       0.44      0.51      0.48       372

    accuracy                           0.45       772
   macro avg       0.46      0.46      0.45       772
weighted avg       0.46      0.45      0.45       772

In [36]:
LogReg_cm = confusion_matrix(y_test, LogReg_pred)
sns.heatmap(LogReg_cm/np.sum(LogReg_cm), annot = True, fmt = '0.2%', cmap = 'Oranges')
plt.title("Logisitic Regression Confusion Matrix")
Out[36]:
Text(0.5, 1.0, 'Logisitic Regression Confusion Matrix')

2. K-Nearest Neighbour Regression -- First Iteration¶

In [37]:
KNN = KNeighborsClassifier()
KNN.fit(X_train, y_train)
KNN_pred = KNN.predict(X_test)

KNN_acc = accuracy_score(y_test, KNN_pred)
print("The accuracy for KNeighbors is", KNN_acc)
print("The classification report using KNeighbors is:", KNN_acc)
print(classification_report(y_test, KNN_pred))
The accuracy for KNeighbors is 0.655440414507772
The classification report using KNeighbors is: 0.655440414507772
              precision    recall  f1-score   support

           0       0.69      0.60      0.64       400
           1       0.62      0.72      0.67       372

    accuracy                           0.66       772
   macro avg       0.66      0.66      0.66       772
weighted avg       0.66      0.66      0.65       772

In [38]:
KNN_cm = confusion_matrix(y_test, KNN_pred)
sns.heatmap(KNN_cm/np.sum(KNN_cm), annot = True, fmt = '0.2%', cmap = 'Oranges')
plt.title("KNN Regression Confusion Matrix")
Out[38]:
Text(0.5, 1.0, 'KNN Regression Confusion Matrix')

3. Decision Tree Classifier -- First Iteration¶

In [39]:
DecTree = DecisionTreeClassifier()
DecTree.fit(X_train, y_train)
DecTree_pred = DecTree.predict(X_test)

DecTree_acc = accuracy_score(y_test, DecTree_pred)
print("The accuracy for Decision Tree is", DecTree_acc)
print("The classification report using Decision Tree is:")
print(classification_report(y_test, DecTree_pred))
The accuracy for Decision Tree is 0.7797927461139896
The classification report using Decision Tree is:
              precision    recall  f1-score   support

           0       0.84      0.71      0.77       400
           1       0.73      0.85      0.79       372

    accuracy                           0.78       772
   macro avg       0.79      0.78      0.78       772
weighted avg       0.79      0.78      0.78       772

In [40]:
DecTree_cm = confusion_matrix(y_test, DecTree_pred)
sns.heatmap(DecTree_cm/np.sum(DecTree_cm), annot = True, fmt = '0.2%', cmap = 'Oranges')
plt.title("Decision Tree Confusion Matrix")
Out[40]:
Text(0.5, 1.0, 'Decision Tree Confusion Matrix')

4. Random Forest Classifier -- First Iteration¶

In [41]:
RFTree = RandomForestClassifier()
RFTree.fit(X_train, y_train)
RFTree_pred = RFTree.predict(X_test)

RFTree_acc = accuracy_score(y_test, RFTree_pred)
print("The accuracy for Random Forest is", RFTree_acc)
print("The classification report using Random Forest is:")
print(classification_report(y_test, RFTree_pred))
The accuracy for Random Forest is 0.8238341968911918
The classification report using Random Forest is:
              precision    recall  f1-score   support

           0       0.83      0.83      0.83       400
           1       0.82      0.81      0.82       372

    accuracy                           0.82       772
   macro avg       0.82      0.82      0.82       772
weighted avg       0.82      0.82      0.82       772

In [42]:
RFTree_cm = confusion_matrix(y_test, RFTree_pred)
sns.heatmap(RFTree_cm/np.sum(RFTree_cm), annot = True, fmt = '0.2%', cmap = 'Oranges')
plt.title("Random Forest Confusion Matrix")
Out[42]:
Text(0.5, 1.0, 'Random Forest Confusion Matrix')

5. Support Vector Machine Classifier -- First Iteration¶

In [43]:
SVM = SVC()
SVM.fit(X_train, y_train)
SVM_pred = SVM.predict(X_test)

SVM_acc = accuracy_score(y_test, SVM_pred)
print("The accuracy for SVM is", SVM_acc)
print("The classification report using SVM is:", SVM_acc)
print(classification_report(y_test, SVM_pred))
The accuracy for SVM is 0.6722797927461139
The classification report using SVM is: 0.6722797927461139
              precision    recall  f1-score   support

           0       0.67      0.71      0.69       400
           1       0.67      0.63      0.65       372

    accuracy                           0.67       772
   macro avg       0.67      0.67      0.67       772
weighted avg       0.67      0.67      0.67       772

In [44]:
SVM_cm = confusion_matrix(y_test, SVM_pred)
sns.heatmap(SVM_cm/np.sum(SVM_cm), annot = True, fmt = '0.2%', cmap = 'Oranges')
plt.title("SVM Confusion Matrix")
Out[44]:
Text(0.5, 1.0, 'SVM Confusion Matrix')

6. XGBoost Classifier -- First Interation¶

In [45]:
XGB = XGBClassifier()
XGB.fit(X_train, y_train)
XGB_pred = XGB.predict(X_test)

XGB_acc = accuracy_score(y_test, XGB_pred)
print("The accuracy for XGBoost is", XGB_acc)
print("The classification report using XGBoost is:", XGB_acc)
print(classification_report(y_test, XGB_pred))
The accuracy for XGBoost is 0.7940414507772021
The classification report using XGBoost is: 0.7940414507772021
              precision    recall  f1-score   support

           0       0.83      0.76      0.79       400
           1       0.76      0.84      0.80       372

    accuracy                           0.79       772
   macro avg       0.80      0.80      0.79       772
weighted avg       0.80      0.79      0.79       772

In [46]:
XGB_cm = confusion_matrix(y_test, XGB_pred)
sns.heatmap(XGB_cm/np.sum(XGB_cm), annot = True, fmt = '0.2%', cmap = 'Oranges')
plt.title("XGBoost Confusion Matrix")
Out[46]:
Text(0.5, 1.0, 'XGBoost Confusion Matrix')

Algorithm Comparison -- First Iteration¶

In [47]:
models = pd.DataFrame({
    'Model':['Logistic Regression', 'KNN Regression', 'Decision Tree', 'Random Forest', 'Support Vector', 'XGBoost'],
    'Accuracy' :[LogReg_acc, KNN_acc, DecTree_acc, RFTree_acc, SVM_acc, XGB_acc]
})
models = models.sort_values(by='Accuracy', ascending=False)
In [48]:
models
Out[48]:
Model Accuracy
3 Random Forest 0.823834
5 XGBoost 0.794041
2 Decision Tree 0.779793
4 Support Vector 0.672280
1 KNN Regression 0.655440
0 Logistic Regression 0.454663
In [49]:
fig = plt.figure(figsize=(15,9))

ax = sns.barplot(data=models, 
                 y='Model', 
                 x='Accuracy',
                 palette = 'Oranges_r')

ax.figure.suptitle('Performance of Models -- First Iteration',y=0.91, size = 16, color = 'black', weight='bold');

plt.xlabel('Accuracy')
plt.ylabel('Models')
plt.axvline(x = 0.8, ymin = 0, ymax = 1,
            linewidth = 2, linestyle =":",
            color ='#cf7849');

After the first iteration of model training we can see that Logistic Regression performed the lowest while KNN, SVM and XGBoost had close accuracy values around 65%. Decision tree classifier had a 76% accuracy while Random Forest was able to be accurate to 83%

Hyper-tunning parameters¶

In [50]:
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV

lgr = LogisticRegression()
knn = KNeighborsClassifier()
dt = DecisionTreeClassifier()
rf = RandomForestClassifier()
svc = SVC()
xgb =XGBClassifier(eval_metric = 'logloss', use_label_encoder=False)
In [51]:
para_lgr = {'solver':['newton-cg', 'lbfgs', 'sag', 'saga', 'liblinear'], 'penalty': ['l1', 'l2', 'elasticnet', 'none']}
grid_lgr = GridSearchCV(lgr, param_grid=para_lgr, cv=5)
grid_lgr.fit(X_train, y_train)
print("Best parameters for Logistic Regression:", grid_lgr.best_params_)
Best parameters for Logistic Regression: {'penalty': 'l1', 'solver': 'saga'}
In [52]:
para_knn = {'n_neighbors':np.arange(1, 50), 'weights': ['uniform', 'distance'], 'algorithm': ['auto', 'ball_tree', 'kd_tree', 'brute']}
grid_knn = GridSearchCV(knn, param_grid=para_knn, cv=5)
grid_knn.fit(X_train, y_train)
print("Best parameters for KNN:", grid_knn.best_params_)
Best parameters for KNN: {'algorithm': 'auto', 'n_neighbors': 42, 'weights': 'distance'}
In [53]:
para_dt = {'criterion':['gini','entropy'],'max_depth':np.arange(1, 50), 'min_samples_leaf':[1,2,4,5,10,20,30,40,80,100]}
grid_dt = GridSearchCV(dt, param_grid=para_dt, cv=5) 
grid_dt.fit(X_train, y_train)
print("Best parameters for Decision Tree:", grid_dt.best_params_)
Best parameters for Decision Tree: {'criterion': 'entropy', 'max_depth': 41, 'min_samples_leaf': 1}
In [54]:
params_rf = {'n_estimators':[100,200, 350, 500], 'min_samples_leaf':[2, 10, 30]}
grid_rf = GridSearchCV(rf, param_grid=params_rf, cv=5)
grid_rf.fit(X_train, y_train)
print("Best parameters for Random Forest:", grid_rf.best_params_)
Best parameters for Random Forest: {'min_samples_leaf': 2, 'n_estimators': 350}
In [55]:
para_svc = {'C': [0.1, 1, 10, 100, 1000], 'gamma': [1, 0.1, 0.01, 0.001, 0.0001],
              'kernel': ['rbf']}
grid_svc = GridSearchCV(svc, param_grid=para_svc, cv=5)
grid_svc.fit(X_train, y_train)
print("Best parameters for SVM:", grid_svc.best_params_)
Best parameters for SVM: {'C': 1, 'gamma': 1, 'kernel': 'rbf'}
In [56]:
params_xgb = {'n_estimators': [50,100,250,400,600,800,1000], 'learning_rate': [0.2,0.5,0.8,1]}
rs_xgb =  RandomizedSearchCV(xgb, param_distributions=params_xgb, cv=5)
rs_xgb.fit(X_train, y_train)
print("Best parameters for XGBoost:", rs_xgb.best_params_)
Best parameters for XGBoost: {'n_estimators': 600, 'learning_rate': 0.5}

Predictive Modeling/Classication -- Second Iteration:¶

After hypertuning¶

1. Logistic Regression -- Second Iteration¶

In [57]:
LogReg2 = LogisticRegression(penalty = 'l1', solver= 'saga')
LogReg2.fit(X_train, y_train)
LogReg2_pred = LogReg2.predict(X_test)

LogReg2_acc = accuracy_score(y_test, LogReg2_pred)
print("The accuracy for Logistic Regression is", LogReg2_acc)
print("The classification report using Logistic Regression is:")
print(classification_report(y_test, LogReg2_pred))
The accuracy for Logistic Regression is 0.45466321243523317
The classification report using Logistic Regression is:
              precision    recall  f1-score   support

           0       0.47      0.40      0.43       400
           1       0.44      0.51      0.47       372

    accuracy                           0.45       772
   macro avg       0.46      0.46      0.45       772
weighted avg       0.46      0.45      0.45       772

In [58]:
LogReg2_cm = confusion_matrix(y_test, LogReg2_pred)
sns.heatmap(LogReg2_cm/np.sum(LogReg2_cm), annot = True, fmt = '0.2%', cmap = 'Blues')
plt.title("Logisitic Regression Confusion Matrix")
Out[58]:
Text(0.5, 1.0, 'Logisitic Regression Confusion Matrix')

2. K-Nearest Neighbour Regression -- Second Iteration¶

In [59]:
KNN2 = KNeighborsClassifier(algorithm = 'auto', n_neighbors = 42, weights ='distance')
KNN2.fit(X_train, y_train)
KNN2_pred = KNN2.predict(X_test)

KNN2_acc = accuracy_score(y_test, KNN2_pred)
print("The accuracy for KNeighbors is", KNN2_acc)
print("The classification report using KNeighbors is:", KNN2_acc)
print(classification_report(y_test, KNN2_pred))
The accuracy for KNeighbors is 0.772020725388601
The classification report using KNeighbors is: 0.772020725388601
              precision    recall  f1-score   support

           0       0.82      0.72      0.77       400
           1       0.73      0.83      0.78       372

    accuracy                           0.77       772
   macro avg       0.78      0.77      0.77       772
weighted avg       0.78      0.77      0.77       772

In [60]:
KNN2_cm = confusion_matrix(y_test, KNN2_pred)
sns.heatmap(KNN2_cm/np.sum(KNN2_cm), annot = True, fmt = '0.2%', cmap = 'Blues')
plt.title("KNN Regression Confusion Matrix")
Out[60]:
Text(0.5, 1.0, 'KNN Regression Confusion Matrix')

3. Decision Tree Classifier -- Second Iteration¶

In [61]:
DecTree2 = DecisionTreeClassifier(criterion = 'entropy', max_depth = 41, min_samples_leaf = 1)
DecTree2.fit(X_train, y_train)
DecTree2_pred = DecTree2.predict(X_test)

DecTree2_acc = accuracy_score(y_test, DecTree2_pred)
print("The accuracy for Decision Tree is", DecTree2_acc)
print("The classification report using Decision Tree is:")
print(classification_report(y_test, DecTree2_pred))
The accuracy for Decision Tree is 0.7797927461139896
The classification report using Decision Tree is:
              precision    recall  f1-score   support

           0       0.84      0.71      0.77       400
           1       0.73      0.85      0.79       372

    accuracy                           0.78       772
   macro avg       0.79      0.78      0.78       772
weighted avg       0.79      0.78      0.78       772

In [62]:
DecTree2_cm = confusion_matrix(y_test, DecTree2_pred)
sns.heatmap(DecTree2_cm/np.sum(DecTree2_cm), annot = True, fmt = '0.2%', cmap = 'Blues')
plt.title("Decision Tree Confusion Matrix")
Out[62]:
Text(0.5, 1.0, 'Decision Tree Confusion Matrix')

4. Random Forest Classifier -- Second Iteration¶

In [63]:
RFTree2 = RandomForestClassifier(min_samples_leaf = 2, n_estimators = 350)
RFTree2.fit(X_train, y_train)
RFTree2_pred = RFTree2.predict(X_test)

RFTree2_acc = accuracy_score(y_test, RFTree2_pred)
print("The accuracy for Random Forest is", RFTree2_acc)
print("The classification report using Random Forest is:")
print(classification_report(y_test, RFTree2_pred))
The accuracy for Random Forest is 0.8354922279792746
The classification report using Random Forest is:
              precision    recall  f1-score   support

           0       0.84      0.84      0.84       400
           1       0.83      0.83      0.83       372

    accuracy                           0.84       772
   macro avg       0.84      0.84      0.84       772
weighted avg       0.84      0.84      0.84       772

In [64]:
RFTree2_cm = confusion_matrix(y_test, RFTree2_pred)
sns.heatmap(RFTree2_cm/np.sum(RFTree2_cm), annot = True, fmt = '0.2%', cmap = 'Blues')
plt.title("Random Forest Confusion Matrix")
Out[64]:
Text(0.5, 1.0, 'Random Forest Confusion Matrix')

5. Support Vector Machine Classifier -- Second Iteration¶

In [65]:
SVM2 = SVC(C = 1, gamma = 1, kernel = 'rbf')
SVM2.fit(X_train, y_train)
SVM2_pred = SVM2.predict(X_test)

SVM2_acc = accuracy_score(y_test, SVM2_pred)
print("The accuracy for SVM is", SVM2_acc)
print("The classification report using SVM is:", SVM2_acc)
print(classification_report(y_test, SVM2_pred))
The accuracy for SVM is 0.8290155440414507
The classification report using SVM is: 0.8290155440414507
              precision    recall  f1-score   support

           0       0.81      0.88      0.84       400
           1       0.86      0.77      0.81       372

    accuracy                           0.83       772
   macro avg       0.83      0.83      0.83       772
weighted avg       0.83      0.83      0.83       772

In [66]:
SVM2_cm = confusion_matrix(y_test, SVM2_pred)
sns.heatmap(SVM2_cm/np.sum(SVM2_cm), annot = True, fmt = '0.2%', cmap = 'Blues')
plt.title("SVM Confusion Matrix")
Out[66]:
Text(0.5, 1.0, 'SVM Confusion Matrix')

6. XGBoost Classifier -- Second Iteration¶

In [80]:
XGB2 = XGBClassifier(n_estimators = 600, learning_rate = 0.5)
XGB2.fit(X_train, y_train)
XGB2_pred = XGB2.predict(X_test)

XGB2_acc = accuracy_score(y_test, XGB2_pred)
print("The accuracy for XGBoost is", XGB2_acc)
print("The classification report using XGBoost is:", XGB2_acc)
print(classification_report(y_test, XGB2_pred))
The accuracy for XGBoost is 0.8018134715025906
The classification report using XGBoost is: 0.8018134715025906
              precision    recall  f1-score   support

           0       0.83      0.77      0.80       400
           1       0.77      0.83      0.80       372

    accuracy                           0.80       772
   macro avg       0.80      0.80      0.80       772
weighted avg       0.80      0.80      0.80       772

In [81]:
XGB2_cm = confusion_matrix(y_test, XGB2_pred)
sns.heatmap(XGB2_cm/np.sum(XGB2_cm), annot = True, fmt = '0.2%', cmap = 'Blues')
plt.title("XGBoost Confusion Matrix")
Out[81]:
Text(0.5, 1.0, 'XGBoost Confusion Matrix')

Comparing Models -- Second Iteration:¶

In [82]:
models2 = pd.DataFrame({
    'Model':['Logistic Regression', 'KNN Regression', 'Decision Tree', 'Random Forest', 'Support Vector', 'XGBoost'],
    'Accuracy' :[LogReg2_acc, KNN2_acc, DecTree2_acc, RFTree2_acc, SVM2_acc, XGB2_acc],
})
models2 = models2.sort_values(by='Accuracy', ascending=False)
In [83]:
models2
Out[83]:
Model Accuracy
3 Random Forest 0.835492
4 Support Vector 0.829016
5 XGBoost 0.801813
2 Decision Tree 0.779793
1 KNN Regression 0.772021
0 Logistic Regression 0.454663
In [84]:
fig = plt.figure(figsize=(15,9))

ax = sns.barplot(data=models2, 
                 y='Model', 
                 x='Accuracy',
                 palette = 'Blues_r')

ax.figure.suptitle('Performance of Models -- Second Iteration',y=0.91, size = 16, color = 'black', weight='bold');

plt.xlabel('Accuracy')
plt.ylabel('Models')
plt.axvline(x = 0.8, ymin = 0, ymax = 1,
            linewidth = 2, linestyle =":",
            color ='#cf7849');

Comparing First and Second Iterations:¶

In [85]:
comp_iterations = pd.DataFrame({
    'Model':['Logistic Regression', 'KNN Regression', 'Decision Tree', 'Random Forest', 'Support Vector', 'XGBoost'],
    '2nd Iteration' :[LogReg2_acc, KNN2_acc, DecTree2_acc, RFTree2_acc, SVM2_acc, XGB2_acc],
    '1st Iteration' :[LogReg_acc, KNN_acc, DecTree_acc, RFTree_acc, SVM_acc, XGB_acc],
})
comp_iterations
Out[85]:
Model 2nd Iteration 1st Iteration
0 Logistic Regression 0.454663 0.454663
1 KNN Regression 0.772021 0.655440
2 Decision Tree 0.779793 0.779793
3 Random Forest 0.835492 0.823834
4 Support Vector 0.829016 0.672280
5 XGBoost 0.801813 0.794041
In [86]:
comp_iterations.plot.bar(figsize=(12,6))
plt.xticks([0,1,2,3,4,5],['Logistic Regression', 'KNN Regression', 'Decision Tree', 'Random Forest', 'Support Vector', 'XGBoost'])
plt.ylabel('Percentage of Accuracy') 
plt.xlabel('Model') 
plt.title('Comparison between Iterations');

After our second iteration of model (with hypertuning) we can see that Random Forest Classifier algortihm performed the best with the highest accuracy.

Making Predictive System¶

In [97]:
# Sample Data From Dataset to check model predict correct value or not.
# this is example of " Water is Not Potable" ( Potability = 0 )

input_data = (8.316766 ,214.373394 ,22018.417441,8.059332,356.886136,363.266516,18.436524,100.341674,4.628771)

# changing the input_data to numpy array
input_data_as_numpy_array = np.asarray(input_data)

# reshape the array as we are predicting for one instance
input_data_reshaped = input_data_as_numpy_array.reshape(1,-1)

# standardize the input data
std_data = scale.transform(input_data_reshaped)
print(std_data)

prediction = RFTree2.predict(std_data)
print(prediction)

if (prediction[0] == 0):
  print('The Water is not Potable')
else:
  print('The Water is Potable')
[[ 0.88132315  0.57348019  0.01412136  0.63002583  0.66554207 -0.77528312
   1.25509008  2.20125291  0.8618897 ]]
[0]
The Water is not Potable
In [98]:
# Sample Data From Dataset to check model predict correct value or not.
# this is example of " Water is Potable" ( Potability = 1 )

input_data = (6.069616,186.659040,26138.780191,7.747547,345.700257,415.886955,12.067620,60.419921,3.669712)

# changing the input_data to numpy array
input_data_as_numpy_array = np.asarray(input_data)

# reshape the array as we are predicting for one instance
input_data_reshaped = input_data_as_numpy_array.reshape(1,-1)

# standardize the input data
std_data = scale.transform(input_data_reshaped)
print(std_data)

prediction = RFTree2.predict(std_data)
print(prediction)

if (prediction[0] == 0):
  print('The Water is not Portable')
else:
  print('The Water is Portable')
[[-0.76708686 -0.28166272  0.49714254  0.42470761  0.33940143 -0.1192583
  -0.67385627 -0.37953875 -0.38546659]]
[1]
The Water is Portable
In [77]:
#Saving Sciketlearn Model
import joblib
joblib.dump ( SVM2 , "Water_Potability_model.pkl")
Out[77]:
['Water_Portability_model.pkl']
In [78]:
import pickle
filename = 'Water_Potability_model.sav'
pickle.dump( SVM2 , open(filename, 'wb'))
 
# some time later...
 
# load the model from disk
loaded_model = pickle.load(open(filename, 'rb'))
result = loaded_model.score(X_test, y_test)
print(result,'% Acuuracy')
0.8290155440414507 % Acuuracy
In [ ]: